##########################################
# R语言入门与回归分析基础
# 严海兵 华东政法大学政治学与公共管理学院
##########################################
### 内容提要
### 1.下载安装
### 2.命令、对象和函数
### 3.数据类型与数据结构
### 4.读入、查看和保存数据
### 5.数据处理
### 6.数据可视化
### 7.回归分析
### 1.下载安装
## 下载R软件:
# 官方网站the Comprehensive R Archive Network (CRAN): https://cran.r-project.org
# Windows用户点击 "Download R for Windows", 然后点击"base", 然后再点击"Download R x.x.x for Windows"
# Mac用户点击"Download R for (Mac) OS X", 然后点击"R-x.x.x.pkg"
# 注意:如果下载速度比较慢,你可以从本地镜像中下载
# 方法:点击CRAN网站首页左上角的 "Mirrors", 然后找到中国选择一个镜像下载。
## 安装R:
# 对于windows用户, 双击“R-x.x.x-win.exe”, 然后按照下列步骤安装:
# 选择语言(建议选英语)--> click OK --> click Next --> click Next
# --> Select destination Location (change “C:\Program Files\R\R-x.x.x” into “C:\Program Files\R\”) and click Next
# --> select Core Files, 64-bit Files(如果电脑是32位系统则选32-bit Files), and Message translation, then click Next
# --> select “No” and click Next --> select start menu folder and click Next
# --> select additional tasks and click Next --> click Finish
# 对于Mac用户, 双击 “R-x.x.x.pkg”, 然后按照下列步骤安装:
# click Continue --> click Continue --> click Continue --> click Continue
# --> click Agree --> click Install --> click Close to finish the installation
# 熟悉R:
# 打开刚才安装的R软件
# 输入代码 print("Hello World") 或"Hello World"
# 要重复相同的代码,不必重新输入,只需要按光标上键,然后回车
## 安装RStudio:
# 从以下官网下载RStudio: https://www.rstudio.com, 然后安装
# 设置本地镜像:点击Tools --> Global Options --> Packages --> Change --> 选择一个中国镜像
# 更改界面显示:点击Tools --> Global Options --> Appearance
# 更改窗口排列:点击Tools --> Global Options --> Pane Layout
### 2. 命令、对象和函数
## 基础命令
# 作为R命令的数学运算
3 + 3
## [1] 6
3 * 3
## [1] 9
3 ^ 3
## [1] 27
# 运算符号:
# 加号(+);
# 减号(-);
# 乘号(*);
# 除号(/);
# 乘方(^)
# 运算顺序:
# 首先算括号, 其次算乘方,然后算乘除,最后算加减
6 + 15 * 22
## [1] 336
(6 + 15) * 22
## [1] 462
22 ^ 2 + 7
## [1] 491
22 ^ (2 + 7)
## [1] 1.207269e+12
# 科学计数法: 1.207269e + 12 = 1.207269 × 10^12 = 1, 207, 269, 000, 000
## 对象(Objects)
# 对象是用来存储数据的名称(又称变量)
# 例如,把3+3的结果保存为x,x就是一个对象
x <- 3 + 3
# 输入x就会返回这个对象的结果
x
## [1] 6
# 注意:
# (1) <- 又叫赋值运算符,等号(=)也可以用来赋值,但不提倡,因为容易造成混淆
# (2) Rstudio中输入<-的快捷方式:mac为同时按option和减号,win为同时按alt和减号
# (3) 在R中对象名称的字母是区分大小写的,比如大写X和小写x表示的是不同的对象
# (4) 当用同样的名称保存另外一个对象时,之前保存的对象就会被替换
x <- 3 + 2
x
## [1] 5
x2 <- (5 + 1) ^ 2
x2
## [1] 36
x + x2
## [1] 41
x2 / x
## [1] 7.2
# 保存两个对象运算的结果:
x3 <- x2 / x
x3
## [1] 7.2
# 练习1
# (1) 把数字7保存为对象z1;
# (2) 把10+5保存为对象z2;
# (3) 把z1乘z2保存为对象z3;
# (4) 对z3减1进行平方。
# 保存并直接输出结果
n1 <- 2 * 6; n1
## [1] 12
(n2 <- 24 / 3)
## [1] 8
# 注意:R对象的名称可以由字母 (a-z, A-Z)、数字 (0-9)、点 (.)和下划线 (_)组成。
# 第一个元素必须是字母或点
# 三种通常的命名方式:
# (1) total_income
# (2) total.income
# (3) totalIncome
## 函数(Functions)
# R拥有大量内置的函数,函数由函数名、圆括号和参数(argument)组成。
log(27) # 27的自然对数
## [1] 3.295837
sqrt(225) # 225的平方根
## [1] 15
abs(-9) # -9的绝对值
## [1] 9
# 函数和对象可以同时使用:
y1 <- 20 + 5
sqrt(y1 + 200)
## [1] 15
# 用函数建立一个新的对象:
y2 <- sqrt(y1 + 1200)
y2
## [1] 35
# 用打印函数输出结果
print(y2)
## [1] 35
# 包含参数的函数:
round(x = 14.5378, digits = 2)
## [1] 14.54
round(14.5378, 1) # 简化书写
## [1] 14.5
# 参数的默认值:
round(14.5378)
## [1] 15
# 函数的嵌套运用
round(sqrt(20), 2)
## [1] 4.47
# function()也是一个函数
new_func <- function(x){
(x + 5) * 6
}
new_func(2)
## [1] 42
# 创建一个n以内数字连续相加的函数
sum_func <- function(n){
x <- 0
for(i in 1:n)
x = x + i
print(x)
}
sum_func(100)
## [1] 5050
# 练习2
# 使用我们前面练习中存储的对象z1, z2和z3做下面的练习:
# (1) 用函数log10()来计算以10为底的z1的对数;
# (2) 用函数sqrt()来计算z2的平方根;
# (3) 用函数sum()来计算z1,z2与z3的和(提示:在括号中用逗号把三个对象隔开);
# (4) 把66.83596存储为对象z4,然后用函数round()保留小数点后三位数。
## 正态分布函数
# 随机生成30个正态分布的数字
rnorm(30)
## [1] -0.02904121 0.64783878 -0.45181684 1.97815402 0.44764250
## [6] 0.32306779 -0.81529937 1.93400570 0.12998041 -0.89175925
## [11] -0.19041786 0.17364719 -0.97593036 -0.53539099 0.65129563
## [16] -1.35776274 1.72278031 0.15262557 1.15766988 -1.05275524
## [21] -0.79137832 -1.11829970 1.34530304 -0.98634110 -0.80232575
## [26] -0.66748559 -0.85004735 0.16331619 -1.47237205 -0.39387200
# 设定种子使结果可复现
set.seed(321)
my_norm <- rnorm(30); my_norm
## [1] 1.70490322 -0.71203856 -0.27798491 -0.11964902 -0.12396062
## [6] 0.26818377 0.72684149 0.23313541 0.33911388 -0.55191467
## [11] 0.34770136 1.48459181 0.18832552 2.44325982 -1.15343949
## [16] -0.80467168 0.45606915 0.42033257 0.57758450 0.44635606
## [21] 0.91725553 -0.10706154 0.98833540 -1.07223880 -0.75801528
## [26] 0.09500072 -2.33093117 0.41751598 -1.12032742 -0.47468470
# 用hist()函数绘制直方图
hist(my_norm)

# 查看函数的参数(arguments)
help(rnorm)
# 或者用问号
?rnorm
my_data <- rnorm(100, mean = 5, sd = 3); my_data
## [1] 0.4087635 6.2471445 6.9025938 8.6925421 4.5363090 5.3436183
## [7] -1.6787899 9.9073710 4.5220378 5.0848024 0.3900401 5.2344734
## [13] 5.7511874 5.7322616 7.3992147 6.0242287 5.7761530 7.8616359
## [19] 8.4092442 2.0471359 2.2710361 2.3327239 4.2519794 5.9625528
## [25] 3.0038918 1.9732504 6.0236048 6.1651935 10.3104926 5.6379864
## [31] 9.3860519 5.1654673 8.7445352 2.0058746 8.5127430 -2.4897467
## [37] 3.8205627 4.5096688 7.1264683 1.5131587 2.0223276 3.8838382
## [43] 4.7315165 6.1098138 6.4832296 4.4715424 6.7702906 9.2180603
## [49] 6.7023223 5.2225075 2.6491256 6.1992028 5.3603921 3.0333240
## [55] 8.9812296 1.3670394 -0.4721249 2.8140450 4.7844491 4.7552152
## [61] 0.5682751 4.3471364 7.2002030 0.9758089 2.7089801 11.8635250
## [67] 6.8955875 7.0949944 5.0539836 2.7622322 1.0895282 2.2438911
## [73] 8.3998980 7.4535358 1.4772297 4.7346048 7.9668772 6.9718932
## [79] 4.3593921 3.9717575 0.3089843 6.5590692 5.7485312 6.2740564
## [85] 9.6300191 6.0126479 4.1671842 5.4993752 1.8932725 4.1586340
## [91] 7.8904932 2.5894741 2.8658086 1.8995941 6.7715629 7.7619308
## [97] 5.7097055 8.4391624 5.6874211 3.2851195
hist(my_data)

# 设置绘图函数的参数
?hist
hist(my_data, # 设置数据
col = "lightblue", # 填充条的颜色
border = "black", # 设置条边框的颜色
main = "Histogram of 100 Random Numbers", # 设置主标题
xlab = "Random Numbers") # 设置横坐标的名称
# 练习3
# 使用runif()函数生成100个均匀分布的数字并保存为vec_unif
# 然后用vec_unif绘制一个直方图,并添加颜色
### 3. 数据类型与数据结构
# 查看数据类型
class(my_data)
## [1] "numeric"
# 向量(vector):相同数据类型构成的一维数据结构
is.vector(my_data)
## [1] TRUE
# 创建三个不同类型的向量
# 字符型向量
name <- c("Zhao", "Qian", "Sun", "Li")
class(name)
## [1] "character"
# 数值型向量
score <- c(517, 413, 306, 489)
class(score)
## [1] "numeric"
typeof(score)
## [1] "double"
# 逻辑型向量
pass <- c(TRUE, FALSE, FALSE, TRUE)
class(pass)
## [1] "logical"
# 利用向量创建一个数据框(data frame)
# 数据框:由行和列组成的二维数据结构,每列的数据类型必须相同
student <- data.frame(name, score, pass); student
## name score pass
## 1 Zhao 517 TRUE
## 2 Qian 413 FALSE
## 3 Sun 306 FALSE
## 4 Li 489 TRUE
# 查看数据结构
str(student)
## 'data.frame': 4 obs. of 3 variables:
## $ name : Factor w/ 4 levels "Li","Qian","Sun",..: 4 2 3 1
## $ score: num 517 413 306 489
## $ pass : logi TRUE FALSE FALSE TRUE
# Factor(因子)是一种特殊的向量,用来存储类别变量
# data.frame()函数创建数据框时自动把字符型向量转变成因子
# 可以通过设置参数阻止转换
student <- data.frame(name, score, pass, stringsAsFactors = FALSE)
str(student)
## 'data.frame': 4 obs. of 3 variables:
## $ name : chr "Zhao" "Qian" "Sun" "Li"
## $ score: num 517 413 306 489
## $ pass : logi TRUE FALSE FALSE TRUE
# 美元符号$: 选取某个变量
student$name
## [1] "Zhao" "Qian" "Sun" "Li"
student$score
## [1] 517 413 306 489
### 4. 读入、查看和保存数据
# R包是拓展R功能的软件(代码库),可以通过install.packages()函数从官网CRAN下载各种包
# R和R包的关系类似手机和手机APP的关系
# 下载gapminder数据包
# install.packages("gapminder")
# 加载/调用R包
library(gapminder)
# 或者
require(gapminder)
## 查看gapminder数据信息
str(gapminder) # 查看数据结构
## Classes 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of 6 variables:
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ pop : int 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
## $ gdpPercap: num 779 821 853 836 740 ...
names(gapminder) # 查看变量名称
## [1] "country" "continent" "year" "lifeExp" "pop" "gdpPercap"
head(gapminder) # 查看前6行数据
## # A tibble: 6 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
tail(gapminder) # 查看最后6行数据
## # A tibble: 6 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Zimbabwe Africa 1982 60.4 7636524 789.
## 2 Zimbabwe Africa 1987 62.4 9216418 706.
## 3 Zimbabwe Africa 1992 60.4 10704340 693.
## 4 Zimbabwe Africa 1997 46.8 11404948 792.
## 5 Zimbabwe Africa 2002 40.0 11926563 672.
## 6 Zimbabwe Africa 2007 43.5 12311143 470.
head(gapminder, 3) # 查看前3行数据
## # A tibble: 3 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
?gapminder # 查看说明文档
## 查看基本的统计信息
summary(gapminder) # 查看所有变量的基本统计信息
## country continent year lifeExp
## Afghanistan: 12 Africa :624 Min. :1952 Min. :23.60
## Albania : 12 Americas:300 1st Qu.:1966 1st Qu.:48.20
## Algeria : 12 Asia :396 Median :1980 Median :60.71
## Angola : 12 Europe :360 Mean :1980 Mean :59.47
## Argentina : 12 Oceania : 24 3rd Qu.:1993 3rd Qu.:70.85
## Australia : 12 Max. :2007 Max. :82.60
## (Other) :1632
## pop gdpPercap
## Min. :6.001e+04 Min. : 241.2
## 1st Qu.:2.794e+06 1st Qu.: 1202.1
## Median :7.024e+06 Median : 3531.8
## Mean :2.960e+07 Mean : 7215.3
## 3rd Qu.:1.959e+07 3rd Qu.: 9325.5
## Max. :1.319e+09 Max. :113523.1
##
summary(gapminder$lifeExp) # 查看某个变量的基本统计信息
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.60 48.20 60.71 59.47 70.85 82.60
mean(gapminder$gdpPercap, na.rm = TRUE) # 计算某个变量的均值
## [1] 7215.327
table(gapminder$continent) # 计算类别变量的频数
##
## Africa Americas Asia Europe Oceania
## 624 300 396 360 24
table(gapminder$year) # 计算年份的频数
##
## 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 2002 2007
## 142 142 142 142 142 142 142 142 142 142 142 142
prop.table(table(gapminder$continent)) # 计算各类别频数的占比
##
## Africa Americas Asia Europe Oceania
## 0.36619718 0.17605634 0.23239437 0.21126761 0.01408451
quantile(gapminder$lifeExp) # 计算变量的分位数(0%,25%,50%,75%和100%)
## 0% 25% 50% 75% 100%
## 23.5990 48.1980 60.7125 70.8455 82.6030
quantile(gapminder$lifeExp, 0.85) # 计算变量的85%分位数
## 85%
## 73.4755
quantile(gapminder$lifeExp, seq(0.1, 1, 0.1)) # 计算间隔为10%的分位数
## 10% 20% 30% 40% 50% 60% 70% 80% 90%
## 41.5108 45.8992 50.6021 55.7292 60.7125 66.0814 69.7465 72.0288 75.0970
## 100%
## 82.6030
## 读入外部数据
# readr: 读取csv和fwf文档
# readxl:读取xls和xlsx文档
# haven:读取SAS、SPSS和Stata文档
# install.packages(c("readr","readxl"))
library(readr)
library(readxl)
# 将gapminder保存为csv文档
write_csv(gapminder, "gapminder.csv")
# 读入csv文档
read_csv("gapminder.csv")
## Parsed with column specification:
## cols(
## country = col_character(),
## continent = col_character(),
## year = col_double(),
## lifeExp = col_double(),
## pop = col_double(),
## gdpPercap = col_double()
## )
## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
## 7 Afghanistan Asia 1982 39.9 12881816 978.
## 8 Afghanistan Asia 1987 40.8 13867957 852.
## 9 Afghanistan Asia 1992 41.7 16317921 649.
## 10 Afghanistan Asia 1997 41.8 22227415 635.
## # … with 1,694 more rows
# 将csv另存为excel格式
# 读入excel文档
read_excel("gapminder.xlsx")
## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
## 7 Afghanistan Asia 1982 39.9 12881816 978.
## 8 Afghanistan Asia 1987 40.8 13867957 852.
## 9 Afghanistan Asia 1992 41.7 16317921 649.
## 10 Afghanistan Asia 1997 41.8 22227415 635.
## # … with 1,694 more rows
### 5. 数据处理
## tidyverse
# tidyverse既是一种R编程风格, 也是一组R包的集合
# 作为R包集合, tidyverse包含的核心包如下:
# ggplot2 - 数据可视化
# dplyr - 数据转换
# tidyr - 数据整洁(清洗)
# readr - 读入矩形数据(csv,tsv和fwf格式的数据)
# purrr - 函数编程
# tibble - 新型数据框
# stringr - 字符数据处理
# forcats - 因子数据处理
# 整体安装和加载tidyverse系列包
# install.packages("tidyverse")
library(tidyverse)
## ── Attaching packages ────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0 ✔ purrr 0.3.2
## ✔ tibble 2.1.1 ✔ dplyr 0.8.1
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ ggplot2 3.2.0 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()

## tibble
# 不把字符自动转成因子
tibble(name = c("Zhao", "Qian", "Sun", "Li"),
score = c(517, 413, 306, 489),
pass = c(TRUE, FALSE, FALSE, TRUE))
## # A tibble: 4 x 3
## name score pass
## <chr> <dbl> <lgl>
## 1 Zhao 517 TRUE
## 2 Qian 413 FALSE
## 3 Sun 306 FALSE
## 4 Li 489 TRUE
## dplyr
# dplyr是tidyverse系列包的核心组成部分,主要用于数据转换(管理)
# dplyr包的核心函数:
# select(), rename(), filter(), arrange(), mutate(), summarize()以及%>%
# 也可以单独加载dplyr包
# install.packages("dplyr")
# library(dplyr)
## select(): 选取特定变量的数据
select(gapminder, country, year, lifeExp) # 选取country, year和lifeExp
## # A tibble: 1,704 x 3
## country year lifeExp
## <fct> <int> <dbl>
## 1 Afghanistan 1952 28.8
## 2 Afghanistan 1957 30.3
## 3 Afghanistan 1962 32.0
## 4 Afghanistan 1967 34.0
## 5 Afghanistan 1972 36.1
## 6 Afghanistan 1977 38.4
## 7 Afghanistan 1982 39.9
## 8 Afghanistan 1987 40.8
## 9 Afghanistan 1992 41.7
## 10 Afghanistan 1997 41.8
## # … with 1,694 more rows
select(gapminder, country:lifeExp) # 选取从country到lifeExp的所有变量
## # A tibble: 1,704 x 4
## country continent year lifeExp
## <fct> <fct> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8
## 2 Afghanistan Asia 1957 30.3
## 3 Afghanistan Asia 1962 32.0
## 4 Afghanistan Asia 1967 34.0
## 5 Afghanistan Asia 1972 36.1
## 6 Afghanistan Asia 1977 38.4
## 7 Afghanistan Asia 1982 39.9
## 8 Afghanistan Asia 1987 40.8
## 9 Afghanistan Asia 1992 41.7
## 10 Afghanistan Asia 1997 41.8
## # … with 1,694 more rows
select(gapminder, -(lifeExp:gdpPercap)) # 选取从lifeExp到gdpPercap之外的所有变量
## # A tibble: 1,704 x 3
## country continent year
## <fct> <fct> <int>
## 1 Afghanistan Asia 1952
## 2 Afghanistan Asia 1957
## 3 Afghanistan Asia 1962
## 4 Afghanistan Asia 1967
## 5 Afghanistan Asia 1972
## 6 Afghanistan Asia 1977
## 7 Afghanistan Asia 1982
## 8 Afghanistan Asia 1987
## 9 Afghanistan Asia 1992
## 10 Afghanistan Asia 1997
## # … with 1,694 more rows
select(gapminder, 1, 3, 4) # 选取第1、3、4行的变量
## # A tibble: 1,704 x 3
## country year lifeExp
## <fct> <int> <dbl>
## 1 Afghanistan 1952 28.8
## 2 Afghanistan 1957 30.3
## 3 Afghanistan 1962 32.0
## 4 Afghanistan 1967 34.0
## 5 Afghanistan 1972 36.1
## 6 Afghanistan 1977 38.4
## 7 Afghanistan 1982 39.9
## 8 Afghanistan 1987 40.8
## 9 Afghanistan 1992 41.7
## 10 Afghanistan 1997 41.8
## # … with 1,694 more rows
select(gapminder, -c(2, 4)) # 选取第2、4行以外的变量
## # A tibble: 1,704 x 4
## country year pop gdpPercap
## <fct> <int> <int> <dbl>
## 1 Afghanistan 1952 8425333 779.
## 2 Afghanistan 1957 9240934 821.
## 3 Afghanistan 1962 10267083 853.
## 4 Afghanistan 1967 11537966 836.
## 5 Afghanistan 1972 13079460 740.
## 6 Afghanistan 1977 14880372 786.
## 7 Afghanistan 1982 12881816 978.
## 8 Afghanistan 1987 13867957 852.
## 9 Afghanistan 1992 16317921 649.
## 10 Afghanistan 1997 22227415 635.
## # … with 1,694 more rows
select(gapminder, starts_with("c")) # 选取以"c"开头的变量
## # A tibble: 1,704 x 2
## country continent
## <fct> <fct>
## 1 Afghanistan Asia
## 2 Afghanistan Asia
## 3 Afghanistan Asia
## 4 Afghanistan Asia
## 5 Afghanistan Asia
## 6 Afghanistan Asia
## 7 Afghanistan Asia
## 8 Afghanistan Asia
## 9 Afghanistan Asia
## 10 Afghanistan Asia
## # … with 1,694 more rows
select(gapminder, ends_with("p")) # 选取以"p"结尾的变量
## # A tibble: 1,704 x 3
## lifeExp pop gdpPercap
## <dbl> <int> <dbl>
## 1 28.8 8425333 779.
## 2 30.3 9240934 821.
## 3 32.0 10267083 853.
## 4 34.0 11537966 836.
## 5 36.1 13079460 740.
## 6 38.4 14880372 786.
## 7 39.9 12881816 978.
## 8 40.8 13867957 852.
## 9 41.7 16317921 649.
## 10 41.8 22227415 635.
## # … with 1,694 more rows
select(gapminder, contains("o")) # 选取包含"o"的变量
## # A tibble: 1,704 x 3
## country continent pop
## <fct> <fct> <int>
## 1 Afghanistan Asia 8425333
## 2 Afghanistan Asia 9240934
## 3 Afghanistan Asia 10267083
## 4 Afghanistan Asia 11537966
## 5 Afghanistan Asia 13079460
## 6 Afghanistan Asia 14880372
## 7 Afghanistan Asia 12881816
## 8 Afghanistan Asia 13867957
## 9 Afghanistan Asia 16317921
## 10 Afghanistan Asia 22227415
## # … with 1,694 more rows
## rename(): 重新命名变量
rename(gapminder, life_expectancy = lifeExp, population = pop)
## # A tibble: 1,704 x 6
## country continent year life_expectancy population gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
## 7 Afghanistan Asia 1982 39.9 12881816 978.
## 8 Afghanistan Asia 1987 40.8 13867957 852.
## 9 Afghanistan Asia 1992 41.7 16317921 649.
## 10 Afghanistan Asia 1997 41.8 22227415 635.
## # … with 1,694 more rows
# 注意: select() 也可以对变量重新命名,但是会抛弃未重新命名的其他变量
select(gapminder, life_expectancy = lifeExp, population = pop)
## # A tibble: 1,704 x 2
## life_expectancy population
## <dbl> <int>
## 1 28.8 8425333
## 2 30.3 9240934
## 3 32.0 10267083
## 4 34.0 11537966
## 5 36.1 13079460
## 6 38.4 14880372
## 7 39.9 12881816
## 8 40.8 13867957
## 9 41.7 16317921
## 10 41.8 22227415
## # … with 1,694 more rows
## filter(): 根据逻辑条件选取特定行的数据
# 常见的逻辑运算符号及其含义
############################
# 符号 # 含义
# < # 小于
# <= # 小于或等于
# > # 大于
# >= # 大于或等于
# == # 等于
# != # 不等于
# !x # 非x
# x | y # x或y
# x & y # x且y
# x %in% y # 检验x是否包含y
#############################
filter(gapminder, country == "China")
## # A tibble: 12 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 China Asia 1952 44 556263527 400.
## 2 China Asia 1957 50.5 637408000 576.
## 3 China Asia 1962 44.5 665770000 488.
## 4 China Asia 1967 58.4 754550000 613.
## 5 China Asia 1972 63.1 862030000 677.
## 6 China Asia 1977 64.0 943455000 741.
## 7 China Asia 1982 65.5 1000281000 962.
## 8 China Asia 1987 67.3 1084035000 1379.
## 9 China Asia 1992 68.7 1164970000 1656.
## 10 China Asia 1997 70.4 1230075000 2289.
## 11 China Asia 2002 72.0 1280400000 3119.
## 12 China Asia 2007 73.0 1318683096 4959.
filter(gapminder, continent == "Asia", year == 2007)
## # A tibble: 33 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 2007 43.8 31889923 975.
## 2 Bahrain Asia 2007 75.6 708573 29796.
## 3 Bangladesh Asia 2007 64.1 150448339 1391.
## 4 Cambodia Asia 2007 59.7 14131858 1714.
## 5 China Asia 2007 73.0 1318683096 4959.
## 6 Hong Kong, China Asia 2007 82.2 6980412 39725.
## 7 India Asia 2007 64.7 1110396331 2452.
## 8 Indonesia Asia 2007 70.6 223547000 3541.
## 9 Iran Asia 2007 71.0 69453570 11606.
## 10 Iraq Asia 2007 59.5 27499638 4471.
## # … with 23 more rows
filter(gapminder, lifeExp < 30)
## # A tibble: 2 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Rwanda Africa 1992 23.6 7290203 737.
filter(gapminder, country %in% c("China", "Japan", "United States"))
## # A tibble: 36 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 China Asia 1952 44 556263527 400.
## 2 China Asia 1957 50.5 637408000 576.
## 3 China Asia 1962 44.5 665770000 488.
## 4 China Asia 1967 58.4 754550000 613.
## 5 China Asia 1972 63.1 862030000 677.
## 6 China Asia 1977 64.0 943455000 741.
## 7 China Asia 1982 65.5 1000281000 962.
## 8 China Asia 1987 67.3 1084035000 1379.
## 9 China Asia 1992 68.7 1164970000 1656.
## 10 China Asia 1997 70.4 1230075000 2289.
## # … with 26 more rows
## arrange(): 根据变量(或列)对行进行重新排序
arrange(gapminder, pop)
## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Sao Tome and Principe Africa 1952 46.5 60011 880.
## 2 Sao Tome and Principe Africa 1957 48.9 61325 861.
## 3 Djibouti Africa 1952 34.8 63149 2670.
## 4 Sao Tome and Principe Africa 1962 51.9 65345 1072.
## 5 Sao Tome and Principe Africa 1967 54.4 70787 1385.
## 6 Djibouti Africa 1957 37.3 71851 2865.
## 7 Sao Tome and Principe Africa 1972 56.5 76595 1533.
## 8 Sao Tome and Principe Africa 1977 58.6 86796 1738.
## 9 Djibouti Africa 1962 39.7 89898 3021.
## 10 Sao Tome and Principe Africa 1982 60.4 98593 1890.
## # … with 1,694 more rows
arrange(gapminder, desc(pop))
## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 China Asia 2007 73.0 1318683096 4959.
## 2 China Asia 2002 72.0 1280400000 3119.
## 3 China Asia 1997 70.4 1230075000 2289.
## 4 China Asia 1992 68.7 1164970000 1656.
## 5 India Asia 2007 64.7 1110396331 2452.
## 6 China Asia 1987 67.3 1084035000 1379.
## 7 India Asia 2002 62.9 1034172547 1747.
## 8 China Asia 1982 65.5 1000281000 962.
## 9 India Asia 1997 61.8 959000000 1459.
## 10 China Asia 1977 64.0 943455000 741.
## # … with 1,694 more rows
# 先按gdpPercap再按pop排序
arrange(gapminder, gdpPercap, pop)
## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Congo, Dem. Rep. Africa 2002 45.0 55379852 241.
## 2 Congo, Dem. Rep. Africa 2007 46.5 64606759 278.
## 3 Lesotho Africa 1952 42.1 748747 299.
## 4 Guinea-Bissau Africa 1952 32.5 580653 300.
## 5 Congo, Dem. Rep. Africa 1997 42.6 47798986 312.
## 6 Eritrea Africa 1952 35.9 1438760 329.
## 7 Myanmar Asia 1952 36.3 20092996 331
## 8 Lesotho Africa 1957 45.0 813338 336.
## 9 Burundi Africa 1952 39.0 2445618 339.
## 10 Eritrea Africa 1957 38.0 1542611 344.
## # … with 1,694 more rows
# 注意: 如果有缺失值(NA), 缺失值总是排在最后
## mutate(): 基于现有变量创建新的变量
mutate(gapminder, gdp = gdpPercap*pop)
## # A tibble: 1,704 x 7
## country continent year lifeExp pop gdpPercap gdp
## <fct> <fct> <int> <dbl> <int> <dbl> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779. 6567086330.
## 2 Afghanistan Asia 1957 30.3 9240934 821. 7585448670.
## 3 Afghanistan Asia 1962 32.0 10267083 853. 8758855797.
## 4 Afghanistan Asia 1967 34.0 11537966 836. 9648014150.
## 5 Afghanistan Asia 1972 36.1 13079460 740. 9678553274.
## 6 Afghanistan Asia 1977 38.4 14880372 786. 11697659231.
## 7 Afghanistan Asia 1982 39.9 12881816 978. 12598563401.
## 8 Afghanistan Asia 1987 40.8 13867957 852. 11820990309.
## 9 Afghanistan Asia 1992 41.7 16317921 649. 10595901589.
## 10 Afghanistan Asia 1997 41.8 22227415 635. 14121995875.
## # … with 1,694 more rows
# 注意: transmute()函数也可以创建新的变量, 但是会抛弃其他变量
transmute(gapminder, gdp = gdpPercap*pop)
## # A tibble: 1,704 x 1
## gdp
## <dbl>
## 1 6567086330.
## 2 7585448670.
## 3 8758855797.
## 4 9648014150.
## 5 9678553274.
## 6 11697659231.
## 7 12598563401.
## 8 11820990309.
## 9 10595901589.
## 10 14121995875.
## # … with 1,694 more rows
# mutate()与case_when()联合使用, 可以对变量进行重新编码
gapminder2007 <- filter(gapminder, year == 2007)
summary(gapminder2007$gdpPercap)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 277.6 1624.8 6124.4 11680.1 18008.8 49357.2
gapminder2007 <- mutate(
gapminder2007,
gdpPercap_class = case_when(
gdpPercap >= 18008.8 ~ "High Income",
gdpPercap < 1624.8 ~ "Low Income",
gdpPercap >= 1624.8 & gdpPercap < 18008.8 ~ "Middle Income"
)
)
## summarize()/summarise(): 用于计算变量的描述统计值
summarize(gapminder, mean(gdpPercap))
## # A tibble: 1 x 1
## `mean(gdpPercap)`
## <dbl>
## 1 7215.
summarize(gapminder,
lifeExp_avg = mean(lifeExp),
pop_avg = mean(pop),
gdpPercap_avg = mean(gdpPercap))
## # A tibble: 1 x 3
## lifeExp_avg pop_avg gdpPercap_avg
## <dbl> <dbl> <dbl>
## 1 59.5 29601212. 7215.
# summarize()通常结合group_by()一起使用
# 按照年份分组
gapminder_year <- group_by(gapminder, year)
summarize(gapminder_year,
lifeExp_avg = mean(lifeExp),
lifeExp_med = median(lifeExp))
## # A tibble: 12 x 3
## year lifeExp_avg lifeExp_med
## <int> <dbl> <dbl>
## 1 1952 49.1 45.1
## 2 1957 51.5 48.4
## 3 1962 53.6 50.9
## 4 1967 55.7 53.8
## 5 1972 57.6 56.5
## 6 1977 59.6 59.7
## 7 1982 61.5 62.4
## 8 1987 63.2 65.8
## 9 1992 64.2 67.7
## 10 1997 65.0 69.4
## 11 2002 65.7 70.8
## 12 2007 67.0 71.9
# 按照各洲分组
gapminder_continent <- group_by(gapminder, continent)
summarize(gapminder_continent,
count = n(),
lifeExp_min = min(lifeExp),
lifeExp_max = max(lifeExp),
lifeExp_avg = mean(lifeExp))
## # A tibble: 5 x 5
## continent count lifeExp_min lifeExp_max lifeExp_avg
## <fct> <int> <dbl> <dbl> <dbl>
## 1 Africa 624 23.6 76.4 48.9
## 2 Americas 300 37.6 80.7 64.7
## 3 Asia 396 28.8 82.6 60.1
## 4 Europe 360 43.6 81.8 71.9
## 5 Oceania 24 69.1 81.2 74.3
## %>%: 管道操作符
gapminder %>%
filter(year == 2007) %>%
group_by(continent) %>%
summarize(count = n(),
lifeExp_min = min(lifeExp),
lifeExp_max = max(lifeExp),
lifeExp_avg = mean(lifeExp))
## # A tibble: 5 x 5
## continent count lifeExp_min lifeExp_max lifeExp_avg
## <fct> <int> <dbl> <dbl> <dbl>
## 1 Africa 52 39.6 76.4 54.8
## 2 Americas 25 60.9 80.7 73.6
## 3 Asia 33 43.8 82.6 70.7
## 4 Europe 30 71.8 81.8 77.6
## 5 Oceania 2 80.2 81.2 80.7
# 注: 如果%>%不起作用, 可以安装并加载magrittr
## 快捷键
# Rstudio shortcut for %>%
# mac: shift + command + M
# win: shift + control + M
### 6. 数据可视化
library(ggplot2)
## ggplot2语法的关键构件:
# 1. 用于绘图的DATA(数据层)
# 2. 数据映射AESTHETIC(美学层)
# 3. 图形对象GEOMETRIC(图形层)
## 绘图模版
# ggplot(data = DATA, mapping = aes(MAPPINGS)) + GEOM_FUCTION()
# 或者
# ggplot(data = DATA) + GEOM_FUCTION(mapping = aes(MAPPINGS))
# 或者
# p <- ggplot(data = DATA, mapping = aes(MAPPINGS))
# p + GEOM_FUCTION()
## 1.直方图
ggplot(data = gapminder2007, mapping = aes(x = lifeExp))

# 添加图形
ggplot(gapminder2007, aes(lifeExp)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# 调整条的宽度
ggplot(gapminder2007, aes(lifeExp)) + geom_histogram(bins = 20) # 调整数量

ggplot(gapminder, aes(lifeExp)) + geom_histogram(binwidth = 6) # 调整宽度

# 设置条及其边框的颜色
ggplot(gapminder2007, aes(lifeExp)) +
geom_histogram(fill = "lightblue", color = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# 添加主标题/x轴标签/更换主题模版
ggplot(gapminder2007, aes(lifeExp)) +
geom_histogram(fill = "lightblue", color = "white") +
ggtitle("Distribution of Life Expectancy, 2007") +
xlab("Life Expectancy") +
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## 条形图
ggplot(gapminder2007, aes(continent)) + geom_bar()

ggplot(gapminder2007, aes(continent, fill = continent)) +
geom_bar(alpha = 0.5) +
ggtitle("Number of Countries on Each Continent") +
xlab("") + guides(fill = FALSE) +
theme_minimal() +
theme(panel.grid.major.x = element_blank())

gapminder2007 %>%
count(continent) %>%
mutate(continent = reorder(continent, n)) %>%
ggplot(aes(continent, n)) +
geom_col(fill = "skyblue3", alpha = 0.8) +
labs(title = "Number of Countries on Each Continent",
x = "", y = "Frequency") +
theme_bw()

## 箱线图
ggplot(gapminder2007, aes(continent, lifeExp)) + geom_boxplot()

ggplot(gapminder2007, aes(reorder(continent, lifeExp, median), lifeExp, fill = continent)) +
geom_boxplot(alpha = 0.6) +
labs(title = "Life Expectancy Comparison, 2007",
xlab ="", y = "Life Expectancy") +
guides(fill = FALSE) +
theme_minimal()

## 折线图
gapminder %>% ggplot(aes(year, lifeExp, group = country)) +
geom_line()

gapminder %>%
filter(country %in% c("Rwanda", "Iraq")) %>%
ggplot(aes(year, lifeExp, group = country, color = country)) +
geom_line(size = 1.2) +
labs(title = "Two Countries Comparison",
x = "", y= "Life Expectancy") +
theme_minimal()

## 散点图
p <- ggplot(gapminder2007, aes(gdpPercap, lifeExp)) + geom_point(shape = 1); p

p + geom_smooth() # 添加loess曲线 (defalt: method = loess)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(gapminder2007, aes(log(gdpPercap), lifeExp)) +
geom_point(shape = 1) +
geom_smooth(method = lm) # 添加回归线

ggplot(gapminder2007, aes(log(gdpPercap), lifeExp, color = continent)) +
geom_point(size = 3.5, alpha = 0.5) +
geom_smooth(method = lm, se = FALSE) +
labs(x = "GDP per capita (log)", y = "Life Expectancy") +
theme_minimal()

gapminder2007 %>%
filter(continent != "Oceania") %>%
ggplot(aes(log(gdpPercap), lifeExp)) +
geom_point(size = 3.5, alpha = 0.5) +
geom_smooth(method = lm, se = FALSE) +
labs(x = "GDP per capita (log)", y = "Life Expectancy") +
theme_bw() +
facet_wrap(~ continent)

## 保存图形
# 安装和加载showtext包用于显示中文字体
# install.packages("showtext")
library(showtext)
## Loading required package: sysfonts
## Loading required package: showtextdb
showtext_auto(enable = TRUE)
# 保存最近绘制的一张图
ggsave("my_graph.pdf", width = 10, height = 8)
ggsave("my_graph.png", width = 7, height = 4)
# 保存已存储在r上的图形
my_plot <- ggplot(gapminder2007, aes(lifeExp, fill = gdpPercap_class)) +
geom_density(alpha = 0.6) + theme_bw(); my_plot

ggsave("my_graph2.pdf", plot = my_plot, width = 10, height = 8)
## 练习4
# 使用ggplot自带的数据集diamonds绘制上述几种图形(折线图除外)
### 7. 回归分析
## 简单回归
# 简单回归模型: Yi = β0 + β*Xi + εi
# i = 1,…,n
# Yi = 因变量的取值
# β0 = 截距(intercept)
# β = 斜率(slope)
# Xi = 自变量的取值
# εi = 随机方差或残差
# 自变量为数值变量(连续变量)
fit <- lm(lifeExp ~ gdpPercap, data = gapminder2007)
summary(fit)
##
## Call:
## lm(formula = lifeExp ~ gdpPercap, data = gapminder2007)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.828 -6.316 1.922 6.898 13.128
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.957e+01 1.010e+00 58.95 <2e-16 ***
## gdpPercap 6.371e-04 5.827e-05 10.93 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.899 on 140 degrees of freedom
## Multiple R-squared: 0.4606, Adjusted R-squared: 0.4567
## F-statistic: 119.5 on 1 and 140 DF, p-value: < 2.2e-16
# 解读:
# Estimate: 自变量gdpPercap每增加1个单位,因变量lifeExp平均增加0.637岁
# p-value: 在0.05显著性水平下是显著的
# R-squared: 自变量能够解释的因变量变化的比例
fit <- lm(lifeExp ~ log(gdpPercap), data = gapminder2007)
summary(fit)
##
## Call:
## lm(formula = lifeExp ~ log(gdpPercap), data = gapminder2007)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.947 -2.661 1.215 4.469 13.115
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.9496 3.8577 1.283 0.202
## log(gdpPercap) 7.2028 0.4423 16.283 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.122 on 140 degrees of freedom
## Multiple R-squared: 0.6544, Adjusted R-squared: 0.652
## F-statistic: 265.2 on 1 and 140 DF, p-value: < 2.2e-16
# NOTE: 自变量和不同情况下取对数的解释
# 因变量Y取对数,自变量X不取对数,参数β解释为:x变动1个单位,y平均变动100*β%
# 因变量Y不取对数,自变量X取对数,参数β解释为:x变动1%,y平均变动0.01*β个单位
# 因变量Y取对数,自变量X也取对数,参数β解释为:x变动1%,y平均变动β%
# R方:回归线 vs. 均值线
gapminder2007 %>%
ggplot(aes(log(gdpPercap), lifeExp)) +
geom_point(size = 3, alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE) + theme_bw() +
geom_hline(yintercept = mean(gapminder2007$lifeExp), color = "red")

# 自变量为类别变量(连续变量)
# 设定类别变量因子水平的顺序
gapminder2007$gdpPercap_class <- factor(gapminder2007$gdpPercap_class,
levels = c("Low Income","Middle Income", "High Income"))
fit <- lm(lifeExp ~ gdpPercap_class, data = gapminder2007)
summary(fit)
##
## Call:
## lm(formula = lifeExp ~ gdpPercap_class, data = gapminder2007)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.362 -3.502 1.692 4.922 14.172
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.125 1.293 41.095 <2e-16 ***
## gdpPercap_classMiddle Income 14.850 1.591 9.335 <2e-16 ***
## gdpPercap_classHigh Income 25.885 1.828 14.159 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.756 on 139 degrees of freedom
## Multiple R-squared: 0.5931, Adjusted R-squared: 0.5873
## F-statistic: 101.3 on 2 and 139 DF, p-value: < 2.2e-16
# 解读:以Low Income为基准,因变量的取值在自变量取类别Middle Income时,比取基准类别时平均高14.85岁 (取High Income时高25.885)
# 虚拟变量的原理
class_avg <- gapminder2007 %>%
group_by(gdpPercap_class) %>%
summarise(lifeExp_avg = mean(lifeExp))
gapminder2007 %>%
ggplot(aes(gdpPercap_class, lifeExp)) +
geom_point(size = 2, alpha = 0.3) + xlab("") + theme_bw() +
annotate("segment", color = "red", linetype = "dashed",
x = c(0, 0, 0), xend = c(1, 2, 3),
y = class_avg$lifeExp_avg, yend = class_avg$lifeExp_avg) +
annotate("text", x = c(0.15, 0.15, 0.15), y = class_avg$lifeExp_avg + 1,
label = round(class_avg$lifeExp_avg, 2)) +
annotate("segment", color = "blue", x = c(1, 1), xend = c(2, 3),
y = class_avg$lifeExp_avg[c(1, 1)], yend = class_avg$lifeExp_avg[c(2, 3)])

## 多元回归
# 多元回归中自变量有多个
model1 <- lm(lifeExp ~ log(gdpPercap), data = gapminder2007); summary(model1)
##
## Call:
## lm(formula = lifeExp ~ log(gdpPercap), data = gapminder2007)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.947 -2.661 1.215 4.469 13.115
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.9496 3.8577 1.283 0.202
## log(gdpPercap) 7.2028 0.4423 16.283 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.122 on 140 degrees of freedom
## Multiple R-squared: 0.6544, Adjusted R-squared: 0.652
## F-statistic: 265.2 on 1 and 140 DF, p-value: < 2.2e-16
model2 <- lm(lifeExp ~ log(gdpPercap) + log(pop), data = gapminder2007); summary(model2)
##
## Call:
## lm(formula = lifeExp ~ log(gdpPercap) + log(pop), data = gapminder2007)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.0435 -2.0693 0.9482 4.7514 12.8927
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.6161 7.5379 -1.143 0.2550
## log(gdpPercap) 7.2445 0.4376 16.555 <2e-16 ***
## log(pop) 0.8114 0.3889 2.086 0.0388 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.038 on 139 degrees of freedom
## Multiple R-squared: 0.6649, Adjusted R-squared: 0.6601
## F-statistic: 137.9 on 2 and 139 DF, p-value: < 2.2e-16
model3 <- lm(lifeExp ~ log(gdpPercap) + log(pop) + continent, data = gapminder2007); summary(model3)
##
## Call:
## lm(formula = lifeExp ~ log(gdpPercap) + log(pop) + continent,
## data = gapminder2007)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.4248 -2.2458 -0.0139 2.4683 14.9571
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.41824 7.45571 2.604 0.01023 *
## log(gdpPercap) 4.64155 0.53758 8.634 1.46e-14 ***
## log(pop) 0.04029 0.35073 0.115 0.90871
## continentAmericas 11.65638 1.69293 6.885 1.98e-10 ***
## continentAsia 10.05215 1.57756 6.372 2.73e-09 ***
## continentEurope 11.23199 1.92647 5.830 3.88e-08 ***
## continentOceania 12.89181 4.54931 2.834 0.00531 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.951 on 135 degrees of freedom
## Multiple R-squared: 0.7674, Adjusted R-squared: 0.7571
## F-statistic: 74.23 on 6 and 135 DF, p-value: < 2.2e-16
# 对Estimate的解读增加“在控制其他自变量不变的条件下”,其他表述与简单回归类似
## 回归诊断
# 线性回归假设:
# (1) 自变量与因变量关系的线性假设
# (2) 自变量相互独立
# (3) 误差项不相关
# (4) 方差齐性,即不存在异方差现象(heteroskedasticity)
# NOTE: 解决异方差性的通常方法是对因变量取对数
par(mfrow = c(2, 2))
plot(model3, which = c(1:4))

## 使用表格形式报告结果
# install.packages("stargazer")
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
# 输出text格式
stargazer(model1, model2, model3, type = "text")
##
## =============================================================================================
## Dependent variable:
## -------------------------------------------------------------------------
## lifeExp
## (1) (2) (3)
## ---------------------------------------------------------------------------------------------
## log(gdpPercap) 7.203*** 7.245*** 4.642***
## (0.442) (0.438) (0.538)
##
## log(pop) 0.811** 0.040
## (0.389) (0.351)
##
## continentAmericas 11.656***
## (1.693)
##
## continentAsia 10.052***
## (1.578)
##
## continentEurope 11.232***
## (1.926)
##
## continentOceania 12.892***
## (4.549)
##
## Constant 4.950 -8.616 19.418**
## (3.858) (7.538) (7.456)
##
## ---------------------------------------------------------------------------------------------
## Observations 142 142 142
## R2 0.654 0.665 0.767
## Adjusted R2 0.652 0.660 0.757
## Residual Std. Error 7.122 (df = 140) 7.038 (df = 139) 5.951 (df = 135)
## F Statistic 265.150*** (df = 1; 140) 137.925*** (df = 2; 139) 74.228*** (df = 6; 135)
## =============================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# 输出html格式并保存为lifeExp_model
# stargazer(model1, model2, model3, type = "html", out = "lifeExp_model.htm")
# NOTE: Microsoft Word可以读入 *.htm文档, 并进行编辑修改
## 使用图形报告结果
# install.packages(c("dotwhisker", "broom"))
library(dotwhisker)
library(broom)
# 绘制模型3的点须图
dwplot(model3)

# 绘制模型1-3的点须图
dwplot(list(model1, model2, model3))

# 模型3黑白版的点须图
dwplot(model3, whisker_args = list(lwd = 0.8, color = "black"),
dot_args = list(size = 3, pch = 21, fill = "black", color = "black")) %>%
relabel_predictors(c("log(gdpPercap)" = "GDP per capita(log)",
"log(pop)" = "population(log)",
"continentAmericas" = "Continent: Americas",
"continentAsia" = "Continent: Asia",
"continentEurope" = "Continent: Europe",
"continentOceania" = "Continent: Oceania")) +
geom_vline(xintercept = 0, color = "grey60", linetype = 2) +
xlab("Coefficient") + theme_bw() +
theme(legend.position = "none")
